batman-adv: Merge bugfixes from 2025.0 1105/head
authorSven Eckelmann <[email protected]>
Fri, 7 Feb 2025 21:13:24 +0000 (22:13 +0100)
committerSven Eckelmann <[email protected]>
Fri, 7 Feb 2025 21:22:48 +0000 (22:22 +0100)
* force stop of throughput detection workers on interface removal

Signed-off-by: Sven Eckelmann <[email protected]>
batman-adv/Makefile
batman-adv/patches/0016-batman-adv-fix-panic-during-interface-removal.patch [new file with mode: 0644]
batman-adv/patches/0017-batman-adv-Ignore-neighbor-throughput-metrics-in-err.patch [new file with mode: 0644]
batman-adv/patches/0018-batman-adv-Drop-unmanaged-ELP-metric-worker.patch [new file with mode: 0644]

index ba54ffc40491b956105716fd911aabcd45da2a88..68abf8e5dd04031c7990e57a03e620aa4d5e7d06 100644 (file)
@@ -4,7 +4,7 @@ include $(TOPDIR)/rules.mk
 
 PKG_NAME:=batman-adv
 PKG_VERSION:=2023.1
-PKG_RELEASE:=9
+PKG_RELEASE:=10
 
 PKG_SOURCE:=$(PKG_NAME)-$(PKG_VERSION).tar.gz
 PKG_SOURCE_URL:=https://downloads.open-mesh.org/batman/releases/batman-adv-$(PKG_VERSION)
diff --git a/batman-adv/patches/0016-batman-adv-fix-panic-during-interface-removal.patch b/batman-adv/patches/0016-batman-adv-fix-panic-during-interface-removal.patch
new file mode 100644 (file)
index 0000000..09be282
--- /dev/null
@@ -0,0 +1,71 @@
+From: Andy Strohman <[email protected]>
+Date: Thu, 9 Jan 2025 02:27:56 +0000
+Subject: batman-adv: fix panic during interface removal
+
+Reference counting is used to ensure that
+batadv_hardif_neigh_node and batadv_hard_iface
+are not freed before/during
+batadv_v_elp_throughput_metric_update work is
+finished.
+
+But there isn't a guarantee that the hard if will
+remain associated with a soft interface up until
+the work is finished.
+
+This fixes a crash triggered by reboot that looks
+like this:
+
+Call trace:
+ batadv_v_mesh_free+0xd0/0x4dc [batman_adv]
+ batadv_v_elp_throughput_metric_update+0x1c/0xa4
+ process_one_work+0x178/0x398
+ worker_thread+0x2e8/0x4d0
+ kthread+0xd8/0xdc
+ ret_from_fork+0x10/0x20
+
+(the batadv_v_mesh_free call is misleading,
+and does not actually happen)
+
+I was able to make the issue happen more reliably
+by changing hardif_neigh->bat_v.metric_work work
+to be delayed work. This allowed me to track down
+and confirm the fix.
+
+Fixes: 5c3245172c01 ("batman-adv: ELP - compute the metric based on the estimated throughput")
+Signed-off-by: Andy Strohman <[email protected]>
+[[email protected]: prevent entering batadv_v_elp_get_throughput without
+ soft_iface]
+Signed-off-by: Sven Eckelmann <[email protected]>
+Origin: upstream, https://git.open-mesh.org/batman-adv.git/commit/1d7c8ac629678b8cd23ef9def7c7b111cb9e8ed5
+
+--- a/net/batman-adv/bat_v_elp.c
++++ b/net/batman-adv/bat_v_elp.c
+@@ -66,12 +66,19 @@ static void batadv_v_elp_start_timer(str
+ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh)
+ {
+       struct batadv_hard_iface *hard_iface = neigh->if_incoming;
++      struct net_device *soft_iface = hard_iface->soft_iface;
+       struct ethtool_link_ksettings link_settings;
+       struct net_device *real_netdev;
+       struct station_info sinfo;
+       u32 throughput;
+       int ret;
++      /* don't query throughput when no longer associated with any
++       * batman-adv interface
++       */
++      if (!soft_iface)
++              return BATADV_THROUGHPUT_DEFAULT_VALUE;
++
+       /* if the user specified a customised value for this interface, then
+        * return it directly
+        */
+@@ -141,7 +148,7 @@ static u32 batadv_v_elp_get_throughput(s
+ default_throughput:
+       if (!(hard_iface->bat_v.flags & BATADV_WARNING_DEFAULT)) {
+-              batadv_info(hard_iface->soft_iface,
++              batadv_info(soft_iface,
+                           "WiFi driver or ethtool info does not provide information about link speeds on interface %s, therefore defaulting to hardcoded throughput values of %u.%1u Mbps. Consider overriding the throughput manually or checking your driver.\n",
+                           hard_iface->net_dev->name,
+                           BATADV_THROUGHPUT_DEFAULT_VALUE / 10,
diff --git a/batman-adv/patches/0017-batman-adv-Ignore-neighbor-throughput-metrics-in-err.patch b/batman-adv/patches/0017-batman-adv-Ignore-neighbor-throughput-metrics-in-err.patch
new file mode 100644 (file)
index 0000000..2ef52c9
--- /dev/null
@@ -0,0 +1,127 @@
+From: Sven Eckelmann <[email protected]>
+Date: Wed, 22 Jan 2025 21:51:20 +0100
+Subject: batman-adv: Ignore neighbor throughput metrics in error case
+
+If a temporary error happened in the evaluation of the neighbor throughput
+information, then the invalid throughput result should not be stored in the
+throughtput EWMA.
+
+Signed-off-by: Sven Eckelmann <[email protected]>
+Origin: upstream, https://git.open-mesh.org/batman-adv.git/commit/ce8b40b7bdcc7f011191acda4c2d3067566eef54
+
+--- a/net/batman-adv/bat_v_elp.c
++++ b/net/batman-adv/bat_v_elp.c
+@@ -59,11 +59,13 @@ static void batadv_v_elp_start_timer(str
+ /**
+  * batadv_v_elp_get_throughput() - get the throughput towards a neighbour
+  * @neigh: the neighbour for which the throughput has to be obtained
++ * @pthroughput: calculated throughput towards the given neighbour in multiples
++ *  of 100kpbs (a value of '1' equals 0.1Mbps, '10' equals 1Mbps, etc).
+  *
+- * Return: The throughput towards the given neighbour in multiples of 100kpbs
+- *         (a value of '1' equals 0.1Mbps, '10' equals 1Mbps, etc).
++ * Return: true when value behind @pthroughput was set
+  */
+-static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh)
++static bool batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh,
++                                      u32 *pthroughput)
+ {
+       struct batadv_hard_iface *hard_iface = neigh->if_incoming;
+       struct net_device *soft_iface = hard_iface->soft_iface;
+@@ -77,14 +79,16 @@ static u32 batadv_v_elp_get_throughput(s
+        * batman-adv interface
+        */
+       if (!soft_iface)
+-              return BATADV_THROUGHPUT_DEFAULT_VALUE;
++              return false;
+       /* if the user specified a customised value for this interface, then
+        * return it directly
+        */
+       throughput =  atomic_read(&hard_iface->bat_v.throughput_override);
+-      if (throughput != 0)
+-              return throughput;
++      if (throughput != 0) {
++              *pthroughput = throughput;
++              return true;
++      }
+       /* if this is a wireless device, then ask its throughput through
+        * cfg80211 API
+@@ -111,19 +115,24 @@ static u32 batadv_v_elp_get_throughput(s
+                        * possible to delete this neighbor. For now set
+                        * the throughput metric to 0.
+                        */
+-                      return 0;
++                      *pthroughput = 0;
++                      return true;
+               }
+               if (ret)
+                       goto default_throughput;
+-              if (sinfo.filled & BIT(NL80211_STA_INFO_EXPECTED_THROUGHPUT))
+-                      return sinfo.expected_throughput / 100;
++              if (sinfo.filled & BIT(NL80211_STA_INFO_EXPECTED_THROUGHPUT)) {
++                      *pthroughput = sinfo.expected_throughput / 100;
++                      return true;
++              }
+               /* try to estimate the expected throughput based on reported tx
+                * rates
+                */
+-              if (sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE))
+-                      return cfg80211_calculate_bitrate(&sinfo.txrate) / 3;
++              if (sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE)) {
++                      *pthroughput = cfg80211_calculate_bitrate(&sinfo.txrate) / 3;
++                      return true;
++              }
+               goto default_throughput;
+       }
+@@ -142,8 +151,10 @@ static u32 batadv_v_elp_get_throughput(s
+                       hard_iface->bat_v.flags &= ~BATADV_FULL_DUPLEX;
+               throughput = link_settings.base.speed;
+-              if (throughput && throughput != SPEED_UNKNOWN)
+-                      return throughput * 10;
++              if (throughput && throughput != SPEED_UNKNOWN) {
++                      *pthroughput = throughput * 10;
++                      return true;
++              }
+       }
+ default_throughput:
+@@ -157,7 +168,8 @@ default_throughput:
+       }
+       /* if none of the above cases apply, return the base_throughput */
+-      return BATADV_THROUGHPUT_DEFAULT_VALUE;
++      *pthroughput = BATADV_THROUGHPUT_DEFAULT_VALUE;
++      return true;
+ }
+ /**
+@@ -169,15 +181,21 @@ void batadv_v_elp_throughput_metric_upda
+ {
+       struct batadv_hardif_neigh_node_bat_v *neigh_bat_v;
+       struct batadv_hardif_neigh_node *neigh;
++      u32 throughput;
++      bool valid;
+       neigh_bat_v = container_of(work, struct batadv_hardif_neigh_node_bat_v,
+                                  metric_work);
+       neigh = container_of(neigh_bat_v, struct batadv_hardif_neigh_node,
+                            bat_v);
+-      ewma_throughput_add(&neigh->bat_v.throughput,
+-                          batadv_v_elp_get_throughput(neigh));
++      valid = batadv_v_elp_get_throughput(neigh, &throughput);
++      if (!valid)
++              goto put_neigh;
++
++      ewma_throughput_add(&neigh->bat_v.throughput, throughput);
++put_neigh:
+       /* decrement refcounter to balance increment performed before scheduling
+        * this task
+        */
diff --git a/batman-adv/patches/0018-batman-adv-Drop-unmanaged-ELP-metric-worker.patch b/batman-adv/patches/0018-batman-adv-Drop-unmanaged-ELP-metric-worker.patch
new file mode 100644 (file)
index 0000000..4b811aa
--- /dev/null
@@ -0,0 +1,239 @@
+From: Sven Eckelmann <[email protected]>
+Date: Wed, 22 Jan 2025 21:51:21 +0100
+Subject: batman-adv: Drop unmanaged ELP metric worker
+
+The ELP worker needs to calculate new metric values for all neighbors
+"reachable" over an interface. Some of the used metric sources require
+locks which might need to sleep. This sleep is incompatible with the RCU
+list iterator used for the recorded neighbors. The initial approach to work
+around of this problem was to queue another work item per neighbor and then
+run this in a new context.
+
+Even when this solved the RCU vs might_sleep() conflict, it has a major
+problems: Nothing was stopping the work item in case it is not needed
+anymore - for example because one of the related interfaces was removed or
+the batman-adv module was unloaded - resulting in potential invalid memory
+accesses.
+
+Directly canceling the metric worker also has various problems:
+
+* cancel_work_sync for a to-be-deactivated interface is called with
+  rtnl_lock held. But the code in the ELP metric worker also tries to use
+  rtnl_lock() - which will never return in this case. This also means that
+  cancel_work_sync would never return because it is waiting for the worker
+  to finish.
+* iterating over the neighbor list for the to-be-deactivated interface is
+  currently done using the RCU specific methods. Which means that it is
+  possible to miss items when iterating over it without the associated
+  spinlock - a behaviour which is acceptable for a periodic metric check
+  but not for a cleanup routine (which must "stop" all still running
+  workers)
+
+The better approch is to get rid of the per interface neighbor metric
+worker and handle everything in the interface worker. The original problems
+are solved by:
+
+* creating a list of neighbors which require new metric information inside
+  the RCU protected context, gathering the metric according to the new list
+  outside the RCU protected context
+* only use rcu_trylock inside metric gathering code to avoid a deadlock
+  when the cancel_delayed_work_sync is called in the interface removal code
+  (which is called with the rtnl_lock held)
+
+Fixes: 5c3245172c01 ("batman-adv: ELP - compute the metric based on the estimated throughput")
+Signed-off-by: Sven Eckelmann <[email protected]>
+Origin: upstream, https://git.open-mesh.org/batman-adv.git/commit/405d49a20a205799ec453cadb3d078dc2808d563
+
+--- a/net/batman-adv/bat_v.c
++++ b/net/batman-adv/bat_v.c
+@@ -113,8 +113,6 @@ static void
+ batadv_v_hardif_neigh_init(struct batadv_hardif_neigh_node *hardif_neigh)
+ {
+       ewma_throughput_init(&hardif_neigh->bat_v.throughput);
+-      INIT_WORK(&hardif_neigh->bat_v.metric_work,
+-                batadv_v_elp_throughput_metric_update);
+ }
+ /**
+--- a/net/batman-adv/bat_v_elp.c
++++ b/net/batman-adv/bat_v_elp.c
+@@ -18,6 +18,7 @@
+ #include <linux/jiffies.h>
+ #include <linux/kernel.h>
+ #include <linux/kref.h>
++#include <linux/list.h>
+ #include <linux/minmax.h>
+ #include <linux/netdevice.h>
+ #include <linux/nl80211.h>
+@@ -26,6 +27,7 @@
+ #include <linux/rcupdate.h>
+ #include <linux/rtnetlink.h>
+ #include <linux/skbuff.h>
++#include <linux/slab.h>
+ #include <linux/stddef.h>
+ #include <linux/string.h>
+ #include <linux/types.h>
+@@ -42,6 +44,18 @@
+ #include "send.h"
+ /**
++ * struct batadv_v_metric_queue_entry - list of hardif neighbors which require
++ *  and metric update
++ */
++struct batadv_v_metric_queue_entry {
++      /** @hardif_neigh: hardif neighbor scheduled for metric update */
++      struct batadv_hardif_neigh_node *hardif_neigh;
++
++      /** @list: list node for metric_queue */
++      struct list_head list;
++};
++
++/**
+  * batadv_v_elp_start_timer() - restart timer for ELP periodic work
+  * @hard_iface: the interface for which the timer has to be reset
+  */
+@@ -137,10 +151,17 @@ static bool batadv_v_elp_get_throughput(
+               goto default_throughput;
+       }
++      /* only use rtnl_trylock because the elp worker will be cancelled while
++       * the rntl_lock is held. the cancel_delayed_work_sync() would otherwise
++       * wait forever when the elp work_item was started and it is then also
++       * trying to rtnl_lock
++       */
++      if (!rtnl_trylock())
++              return false;
++
+       /* if not a wifi interface, check if this device provides data via
+        * ethtool (e.g. an Ethernet adapter)
+        */
+-      rtnl_lock();
+       ret = __ethtool_get_link_ksettings(hard_iface->net_dev, &link_settings);
+       rtnl_unlock();
+       if (ret == 0) {
+@@ -175,31 +196,19 @@ default_throughput:
+ /**
+  * batadv_v_elp_throughput_metric_update() - worker updating the throughput
+  *  metric of a single hop neighbour
+- * @work: the work queue item
++ * @neigh: the neighbour to probe
+  */
+-void batadv_v_elp_throughput_metric_update(struct work_struct *work)
++static void
++batadv_v_elp_throughput_metric_update(struct batadv_hardif_neigh_node *neigh)
+ {
+-      struct batadv_hardif_neigh_node_bat_v *neigh_bat_v;
+-      struct batadv_hardif_neigh_node *neigh;
+       u32 throughput;
+       bool valid;
+-      neigh_bat_v = container_of(work, struct batadv_hardif_neigh_node_bat_v,
+-                                 metric_work);
+-      neigh = container_of(neigh_bat_v, struct batadv_hardif_neigh_node,
+-                           bat_v);
+-
+       valid = batadv_v_elp_get_throughput(neigh, &throughput);
+       if (!valid)
+-              goto put_neigh;
++              return;
+       ewma_throughput_add(&neigh->bat_v.throughput, throughput);
+-
+-put_neigh:
+-      /* decrement refcounter to balance increment performed before scheduling
+-       * this task
+-       */
+-      batadv_hardif_neigh_put(neigh);
+ }
+ /**
+@@ -273,14 +282,16 @@ batadv_v_elp_wifi_neigh_probe(struct bat
+  */
+ static void batadv_v_elp_periodic_work(struct work_struct *work)
+ {
++      struct batadv_v_metric_queue_entry *metric_entry;
++      struct batadv_v_metric_queue_entry *metric_safe;
+       struct batadv_hardif_neigh_node *hardif_neigh;
+       struct batadv_hard_iface *hard_iface;
+       struct batadv_hard_iface_bat_v *bat_v;
+       struct batadv_elp_packet *elp_packet;
++      struct list_head metric_queue;
+       struct batadv_priv *bat_priv;
+       struct sk_buff *skb;
+       u32 elp_interval;
+-      bool ret;
+       bat_v = container_of(work, struct batadv_hard_iface_bat_v, elp_wq.work);
+       hard_iface = container_of(bat_v, struct batadv_hard_iface, bat_v);
+@@ -316,6 +327,8 @@ static void batadv_v_elp_periodic_work(s
+       atomic_inc(&hard_iface->bat_v.elp_seqno);
++      INIT_LIST_HEAD(&metric_queue);
++
+       /* The throughput metric is updated on each sent packet. This way, if a
+        * node is dead and no longer sends packets, batman-adv is still able to
+        * react timely to its death.
+@@ -340,16 +353,28 @@ static void batadv_v_elp_periodic_work(s
+               /* Reading the estimated throughput from cfg80211 is a task that
+                * may sleep and that is not allowed in an rcu protected
+-               * context. Therefore schedule a task for that.
++               * context. Therefore add it to metric_queue and process it
++               * outside rcu protected context.
+                */
+-              ret = queue_work(batadv_event_workqueue,
+-                               &hardif_neigh->bat_v.metric_work);
+-
+-              if (!ret)
++              metric_entry = kzalloc(sizeof(*metric_entry), GFP_ATOMIC);
++              if (!metric_entry) {
+                       batadv_hardif_neigh_put(hardif_neigh);
++                      continue;
++              }
++
++              metric_entry->hardif_neigh = hardif_neigh;
++              list_add(&metric_entry->list, &metric_queue);
+       }
+       rcu_read_unlock();
++      list_for_each_entry_safe(metric_entry, metric_safe, &metric_queue, list) {
++              batadv_v_elp_throughput_metric_update(metric_entry->hardif_neigh);
++
++              batadv_hardif_neigh_put(metric_entry->hardif_neigh);
++              list_del(&metric_entry->list);
++              kfree(metric_entry);
++      }
++
+ restart_timer:
+       batadv_v_elp_start_timer(hard_iface);
+ out:
+--- a/net/batman-adv/bat_v_elp.h
++++ b/net/batman-adv/bat_v_elp.h
+@@ -10,7 +10,6 @@
+ #include "main.h"
+ #include <linux/skbuff.h>
+-#include <linux/workqueue.h>
+ int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface);
+ void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface);
+@@ -19,6 +18,5 @@ void batadv_v_elp_iface_activate(struct
+ void batadv_v_elp_primary_iface_set(struct batadv_hard_iface *primary_iface);
+ int batadv_v_elp_packet_recv(struct sk_buff *skb,
+                            struct batadv_hard_iface *if_incoming);
+-void batadv_v_elp_throughput_metric_update(struct work_struct *work);
+ #endif /* _NET_BATMAN_ADV_BAT_V_ELP_H_ */
+--- a/net/batman-adv/types.h
++++ b/net/batman-adv/types.h
+@@ -596,9 +596,6 @@ struct batadv_hardif_neigh_node_bat_v {
+        *  neighbor
+        */
+       unsigned long last_unicast_tx;
+-
+-      /** @metric_work: work queue callback item for metric update */
+-      struct work_struct metric_work;
+ };
+ /**